Import data

# === Import data & preprocessing ===
customer_raw <- fread("./data/marketing_campaign.csv")

# Marital-status levels considered "living alone" (kept for reference; the
# pipeline below does not use it — TODO confirm before deleting)
alone_status <- c("Widow", "Single", "Divorced", "Alone", "Absurd")

# Drop unused columns by position, impute missing Income with the rounded
# mean, derive Age and a combined campaign-acceptance flag, relabel the
# Education levels, drop the "YOLO" marital status, then reorder columns and
# convert the categorical ones to factors.
customer <- customer_raw %>% select(-c(8, 27:28)) %>% 
  # for missing
  mutate(
    # Mean imputation for the handful of missing Income values
    Income = ifelse(is.na(Income), round(mean(Income, na.rm = TRUE)), Income),
    # Age as of 2022 (the analysis year)
    Age = 2022 - Year_Birth,
    # 1 if the customer accepted any campaign (AcceptedCmp1-5 or Response)
    AcceptedComp = ifelse(rowSums(across(starts_with("AcceptedCmp"))) + Response > 0, 1, 0),
    Education = recode(Education,
                       Basic = "Below some college", `2n Cycle` = "Associate degree",
                       Graduation = "Bachelor's degree", Master = "Master's degree",
                       PhD = "Doctoral degree")
  ) %>% 
  filter(Marital_Status != "YOLO") %>% 
  # Positional reorder so the derived Age/AcceptedComp columns sit with
  # related variables (final order matches the summary() output below)
  select(1, 27, 3:8, 25, 9:14, 15, 28, 16:19) %>% 
  # Education, Marital_Status, Complain, AcceptedComp become factors;
  # across() replaces the superseded mutate_at()
  mutate(across(c(3:4, 9, 17), as.factor))

# Import original data & sample (header said 0.5%, but the cached sample
# below was drawn at 0.2% — see round(nrow(bf_dat) * 0.002); TODO reconcile)

# One-time down-sampling of the full Kaggle file, cached to big_five.csv:
# bf_dat <- fread("./data/data-final.csv")
# chose_ind <- sample(c(1:nrow(bf_dat)), size = round(nrow(bf_dat) * 0.002), replace = FALSE)
# bf <- bf_dat[chose_ind, ]
# write_csv(bf, "./data/big_five.csv")
big_five_raw <- fread("./data/big_five.csv")

# Country lookup from countrycode::codelist, columns selected by position
# (presumably continent and the English country name — TODO confirm against
# the installed countrycode version, as codelist's column order can change)
country_df <- countrycode::codelist %>% 
  select(3, 6, 17)
colnames(country_df)[3] <- "country"
# Attach continent / country name, then drop survey-metadata columns by
# position (fragile: depends on the exact column order of big_five.csv)
big_five_raw <- left_join(big_five_raw, country_df, by = "country") %>% 
  select(-c(51:104, 106, 108))
# Remove respondents with any unanswered item (an answer of 0 in the first
# 50 item columns), then drop the helper column num_zero (position 57)
big_five_raw <- big_five_raw %>% 
  mutate(num_zero = rowSums(big_five_raw[, 1:50] == 0)) %>% 
  filter(num_zero == 0) %>% select(-57)

# Keep single-submission respondents (IPC == 1), convert the 50 items plus
# two metadata columns (positions 55:56) to factors, and coerce the timing
# and geolocation columns to numeric before dropping their raw versions.
big_five <- big_five_raw %>% 
  filter(IPC == 1) %>% 
  mutate_at(c(1:50, 55:56), .funs = ~as.factor(.)) %>% 
  mutate(
    # NOTE(review): "testlapse" (sic) creates a NEW column instead of
    # overwriting "testelapse" — likely a typo. Fixing it would shift the
    # positional select below, so verify before changing either spelling.
    testlapse = as.numeric(testelapse),
    lat = as.numeric(lat_appx_lots_of_err),
    long = as.numeric(long_appx_lots_of_err),
    country = country.name.en
    ) %>% 
  select(-c(51:54, 56)) %>%   # drop the raw columns just converted (by position)
  na.omit()

# Add a stable row id as the first column (before the first item, EXT1).
# seq_len() replaces the accidental-but-equivalent seq(1:nrow(...)).
big_five <- big_five %>% mutate(id = seq_len(nrow(big_five))) %>% 
  relocate(id, .before = EXT1)

EDA

# --- customer ---
# Check correlation: one-hot encode all customer columns via model.matrix
# (the first [, -1] drops the intercept column)
check_cor <- model.matrix(~., customer)[, -1]
# NOTE(review): the second [, -1] below additionally drops the first
# remaining column (ID) from the correlation plot — confirm that this double
# drop is intentional rather than accidental.
corrplot(cor(check_cor[, -1]), method = "circle", type = "full",
         tl.cex = 0.75, tl.col = "black")

# Some demographics: quick distribution summary of every retained column
summary(customer)
##        ID             Age                     Education     Marital_Status
##  Min.   :    0   Min.   : 26.0   Associate degree  : 203   Absurd  :  2   
##  1st Qu.: 2830   1st Qu.: 45.0   Bachelor's degree :1127   Alone   :  3   
##  Median : 5458   Median : 52.0   Below some college:  54   Divorced:232   
##  Mean   : 5592   Mean   : 53.2   Doctoral degree   : 484   Married :864   
##  3rd Qu.: 8425   3rd Qu.: 63.0   Master's degree   : 370   Single  :480   
##  Max.   :11191   Max.   :129.0                             Together:580   
##                                                            Widow   : 77   
##      Income          Kidhome          Teenhome         Recency      Complain
##  Min.   :  1730   Min.   :0.0000   Min.   :0.0000   Min.   : 0.00   0:2217  
##  1st Qu.: 35528   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:24.00   1:  21  
##  Median : 51790   Median :0.0000   Median :0.0000   Median :49.00           
##  Mean   : 52251   Mean   :0.4446   Mean   :0.5058   Mean   :49.15           
##  3rd Qu.: 68307   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:74.00           
##  Max.   :666666   Max.   :2.0000   Max.   :2.0000   Max.   :99.00           
##                                                                             
##     MntWines         MntFruits      MntMeatProducts  MntFishProducts 
##  Min.   :   0.00   Min.   :  0.00   Min.   :   0.0   Min.   :  0.00  
##  1st Qu.:  23.25   1st Qu.:  1.00   1st Qu.:  16.0   1st Qu.:  3.00  
##  Median : 173.00   Median :  8.00   Median :  67.0   Median : 12.00  
##  Mean   : 303.92   Mean   : 26.32   Mean   : 167.1   Mean   : 37.56  
##  3rd Qu.: 504.75   3rd Qu.: 33.00   3rd Qu.: 232.0   3rd Qu.: 50.00  
##  Max.   :1493.00   Max.   :199.00   Max.   :1725.0   Max.   :259.00  
##                                                                      
##  MntSweetProducts  MntGoldProds    NumDealsPurchases AcceptedComp
##  Min.   :  0.00   Min.   :  0.00   Min.   : 0.000    0:1630      
##  1st Qu.:  1.00   1st Qu.:  9.00   1st Qu.: 1.000    1: 608      
##  Median :  8.00   Median : 24.00   Median : 2.000                
##  Mean   : 27.08   Mean   : 44.02   Mean   : 2.323                
##  3rd Qu.: 33.00   3rd Qu.: 56.00   3rd Qu.: 3.000                
##  Max.   :263.00   Max.   :362.00   Max.   :15.000                
##                                                                  
##  NumWebPurchases  NumCatalogPurchases NumStorePurchases NumWebVisitsMonth
##  Min.   : 0.000   Min.   : 0.000      Min.   : 0.00     Min.   : 0.000   
##  1st Qu.: 2.000   1st Qu.: 0.000      1st Qu.: 3.00     1st Qu.: 3.000   
##  Median : 4.000   Median : 2.000      Median : 5.00     Median : 6.000   
##  Mean   : 4.082   Mean   : 2.664      Mean   : 5.79     Mean   : 5.314   
##  3rd Qu.: 6.000   3rd Qu.: 4.000      3rd Qu.: 8.00     3rd Qu.: 7.000   
##  Max.   :27.000   Max.   :28.000      Max.   :13.00     Max.   :20.000   
## 
# Stacked bar chart: participant counts for every Education x Marital_Status
# combination, one bar per education level.
customer %>% 
  count(Education, Marital_Status, name = "tot") %>% 
  ggplot(aes(x = tot, y = Education, fill = Marital_Status)) +
  geom_col() +
  theme_bw() +
  scale_fill_brewer(palette = "RdYlGn") +
  labs(
    x = "Number of participants",
    y = "Education",
    title = "Frequency by Marital Status & Education"
  )

# Total demographics
# gtsummary overview table of all customer variables; Complain/AcceptedComp
# are relabelled No/Yes and the ID column (position 1) is dropped.
# Fixes: "dat =" relied on partial argument matching (the parameter is
# "data"); label typos "Catelogue" -> "Catalogue", "Directlt" -> "Directly".
tbl_summary(data = customer %>% 
              mutate(Complain = factor(Complain, levels = c(0, 1), labels = c("No", "Yes")),
                     AcceptedComp = factor(AcceptedComp, levels = c(0, 1), labels = c("No", "Yes"))) %>% 
              select(-1),
            label = list(Marital_Status ~ "Marital Status", Kidhome ~ "Number of Kids",
                         Teenhome ~ "Number of Teens", Recency ~ "Number of Days Since Last Purchase",
                         Complain ~ "Complaint (in last 2 yrs)",
                         
                         # products
                         MntWines ~ "Wine", MntFruits ~ "Fruits", MntMeatProducts ~ "Meat Products",
                         MntFishProducts ~ "Fish Products", MntSweetProducts ~ "Sweet Products",
                         MntGoldProds ~ "Gold Products", 
                         
                         # campaign & place
                         NumDealsPurchases ~ "Number of Purchases made with a discount", 
                         AcceptedComp ~ "Customer Accepted the Offer",
                         NumWebPurchases ~ "Through the Company's Website", NumCatalogPurchases ~ "Using a Catalogue",
                         NumStorePurchases ~ "Directly in Stores", NumWebVisitsMonth ~ "Website Visits (in last month)")) %>% 
  # Footnotes attach to the first row of each labelled group
  modify_table_styling(footnote = "Amount spent on a type of product in last two years",
                       rows = (label == "Wine"), columns = label) %>% 
  modify_table_styling(footnote = "Number of purchases made through a way",
                       rows = (label == "Through the Company's Website"),
                       columns = label) %>% 
  modify_caption("**Demographics of Customer (n=2238)**") %>% 
  bold_labels()
Demographics of Customer (n=2238)
Characteristic N = 2,2381
Age 52 (45, 63)
Education
    Associate degree 203 (9.1%)
    Bachelor's degree 1,127 (50%)
    Below some college 54 (2.4%)
    Doctoral degree 484 (22%)
    Master's degree 370 (17%)
Marital Status
    Absurd 2 (<0.1%)
    Alone 3 (0.1%)
    Divorced 232 (10%)
    Married 864 (39%)
    Single 480 (21%)
    Together 580 (26%)
    Widow 77 (3.4%)
Income 51,790 (35,528, 68,307)
Number of Kids
    0 1,291 (58%)
    1 899 (40%)
    2 48 (2.1%)
Number of Teens
    0 1,158 (52%)
    1 1,028 (46%)
    2 52 (2.3%)
Number of Days Since Last Purchase 49 (24, 74)
Complaint (in last 2 yrs) 21 (0.9%)
Wine2 173 (23, 505)
Fruits 8 (1, 33)
Meat Products 67 (16, 232)
Fish Products 12 (3, 50)
Sweet Products 8 (1, 33)
Gold Products 24 (9, 56)
Number of Purchases made with a discount 2 (1, 3)
Customer Accepted the Offer 608 (27%)
Through the Company's Website3 4 (2, 6)
Using a Catalogue 2 (0, 4)
Directly in Stores 5 (3, 8)
Website Visits (in last month) 6 (3, 7)
1 Median (IQR); n (%)
2 Amount spent on a type of product in last two years
3 Number of purchases made through a way
# --- Big Five ---
# Geographic map
# World-map scatter of respondent locations, one color per country
# (legend suppressed — too many countries to list).
mapworld <- borders("world", colour = "gray50", fill = "white")
ggplot(big_five) +
  geom_point(aes(x = long, y = lat, color = country), size = 1.5) +
  mapworld +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "Longitude", y = "Latitude", title = "Geographic Distribution of Participants (n=1243)")

# Frequency by country
# Respondent counts per country; plot the 50 largest.
bf_eda <- big_five %>% 
  group_by(country) %>% summarise(tot = n())
# slice_max() replaces the superseded top_n() and makes the ranking column
# explicit; the previous manual order() pre-sort was redundant because
# reorder() in aes() handles the display order.
bf_eda %>% 
  slice_max(tot, n = 50) %>% 
  ggplot(aes(x = tot, y = reorder(country, tot))) +
  geom_bar(stat = "identity") + theme_bw() + 
  theme(legend.position = "none", axis.text.y = element_text(size = 7)) +
  labs(x = "Number of Participants", y = "Country", title = "Number of Participants by Country")

Clustering on customer data

K-means

set.seed(202212)  # reproducible k-means starts and gap-statistic bootstraps
# Continuous columns only, selected by position (Income, Kidhome, ...,
# purchase counts — matches the columns fed to daisy()-free methods), then
# standardized so no variable dominates the Euclidean distance.
customer_con <- customer[, c(5:8, 10:16, 18:21)]
customer_scale <- scale(customer_con)

# Choose k via the gap statistic (iter.max raised so kmeans converges)
fviz_nbclust(customer_scale, FUNcluster = kmeans, 
             method = "gap_stat", iter.max = 50)

# Final k-means fit with k = 3; 20 random starts for a stable solution
km <- kmeans(customer_scale, centers = 3, nstart = 20)
fviz_cluster(list(data = customer_scale, cluster = km$cluster),
             ellipse.type = "convex", geom = "point",
             labelsize = 5, palette = "Dark2",
             main = "K-means Cluster Plot for Customer Data") + theme_bw()

K-medoids and Gower distance

# Gower distance (handles the mixed numeric/factor columns)
# NOTE(review): customer still contains the ID column here, so ID
# contributes to the dissimilarities as an interval variable — confirm
# whether it should be excluded before computing distances.
res_gower <- daisy(customer, metric = "gower")
summary(res_gower)
## 2503203 dissimilarities, summarized :
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000128 0.1861700 0.2356000 0.2361100 0.2853200 0.5671500 
## Metric :  mixed ;  Types = I, I, N, N, I, I, I, I, N, I, I, I, I, I, I, I, N, I, I, I, I 
## Number of objects : 2238
# NOTE(review): this result is never used — km_mod below recomputes the same
# k = 3 fit, and the lapply() next overwrites pam_clust locally. Candidate
# for removal.
pam_clust <- pam(as.matrix(res_gower), diss = TRUE, k = 3)

# Find the number of clusters - 3
# Average silhouette width of a k-medoids (PAM) fit on the Gower matrix for
# k = 2..10; returns a list of length-1 numerics (consumed by rbind below).
res_silhouette <- lapply(2:10, function(k) {
  pam(as.matrix(res_gower), diss = TRUE, k = k)$silinfo$avg.width
})

# Line plot of silhouette width vs k (fixed "Sihouette" typo in the label)
do.call(rbind, res_silhouette) %>% 
  as.data.frame() %>% mutate(cluster = c(2:10)) %>% 
  ggplot(aes(x = cluster, y = V1)) +
  geom_line() + theme_bw() +
  labs(x = "Number of clusters k", y = "Silhouette Width", 
       title = "Optimal number of clusters")

# Final k-medoids (PAM) fit with k = 3 on the Gower dissimilarities
km_mod <- pam(as.matrix(res_gower), diss = TRUE, k = 3)

# Clustering Comparison: Rand index between the k-means and k-medoids labels
fossil::rand.index(km$cluster, km_mod$clustering)
## [1] 0.7355628
# --- Demographic for each cluster ---
# Same gtsummary table as above, stratified by k-medoids cluster, with
# ANOVA / chi-squared p-values and an overall column.
# Fixes label typos: "Catelogue" -> "Catalogue", "Directlt" -> "Directly".
tbl_summary(by = clust,
            data = customer %>% 
              mutate(clust = km_mod$clustering,
                     Complain = factor(Complain, levels = c(0, 1), labels = c("No", "Yes")),
                     AcceptedComp = factor(AcceptedComp, levels = c(0, 1), labels = c("No", "Yes")),
                     clust = factor(clust, levels = c(1:3), labels = c("Cluster 1", "Cluster 2", "Cluster 3"))) %>% 
              select(-1),
            label = list(Marital_Status ~ "Marital Status", Kidhome ~ "Number of Kids",
                         Teenhome ~ "Number of Teens", Recency ~ "Number of Days Since Last Purchase",
                         Complain ~ "Complaint (in last 2 yrs)",
                         
                         # products
                         MntWines ~ "Wine", MntFruits ~ "Fruits", MntMeatProducts ~ "Meat Products",
                         MntFishProducts ~ "Fish Products", MntSweetProducts ~ "Sweet Products",
                         MntGoldProds ~ "Gold Products", 
                         
                         # campaign & place
                         NumDealsPurchases ~ "Number of Purchases made with a discount", 
                         AcceptedComp ~ "Customer Accepted the Offer",
                         NumWebPurchases ~ "Through the Company's Website", NumCatalogPurchases ~ "Using a Catalogue",
                         NumStorePurchases ~ "Directly in Stores", NumWebVisitsMonth ~ "Website Visits (in last month)")) %>% 
  add_p(list(all_categorical() ~ "chisq.test", all_continuous() ~ "aov")) %>% add_overall() %>% 
  modify_table_styling(footnote = "Amount spent on a type of product in last two years",
                       rows = (label == "Wine"), columns = label) %>% 
  modify_table_styling(footnote = "Number of purchases made through a way",
                       rows = (label == "Through the Company's Website"),
                       columns = label) %>% 
  modify_caption("**Demographics of Customers for each K-medoids Cluster (n=2238)**") %>% 
  bold_labels()
Demographics of Customers for each K-medoids Cluster (n=2238)
Characteristic Overall, N = 2,2381 Cluster 1, N = 6651 Cluster 2, N = 9071 Cluster 3, N = 6661 p-value2
Age 52 (45, 63) 57 (46, 66) 48 (41, 54) 57 (49, 66) <0.001
Education <0.001
    Associate degree 203 (9.1%) 60 (9.0%) 97 (11%) 46 (6.9%)
    Bachelor's degree 1,127 (50%) 454 (68%) 546 (60%) 127 (19%)
    Below some college 54 (2.4%) 3 (0.5%) 49 (5.4%) 2 (0.3%)
    Doctoral degree 484 (22%) 46 (6.9%) 71 (7.8%) 367 (55%)
    Master's degree 370 (17%) 102 (15%) 144 (16%) 124 (19%)
Marital Status <0.001
    Absurd 2 (<0.1%) 2 (0.3%) 0 (0%) 0 (0%)
    Alone 3 (0.1%) 0 (0%) 1 (0.1%) 2 (0.3%)
    Divorced 232 (10%) 69 (10%) 91 (10%) 72 (11%)
    Married 864 (39%) 110 (17%) 386 (43%) 368 (55%)
    Single 480 (21%) 155 (23%) 214 (24%) 111 (17%)
    Together 580 (26%) 299 (45%) 195 (21%) 86 (13%)
    Widow 77 (3.4%) 30 (4.5%) 20 (2.2%) 27 (4.1%)
Income 51,790 (35,528, 68,307) 70,713 (59,925, 79,803) 34,176 (25,238, 42,394) 58,100 (48,918, 67,442) <0.001
Number of Kids <0.001
    0 1,291 (58%) 621 (93%) 142 (16%) 528 (79%)
    1 899 (40%) 44 (6.6%) 729 (80%) 126 (19%)
    2 48 (2.1%) 0 (0%) 36 (4.0%) 12 (1.8%)
Number of Teens <0.001
    0 1,158 (52%) 442 (66%) 590 (65%) 126 (19%)
    1 1,028 (46%) 216 (32%) 305 (34%) 507 (76%)
    2 52 (2.3%) 7 (1.1%) 12 (1.3%) 33 (5.0%)
Number of Days Since Last Purchase 49 (24, 74) 46 (21, 72) 50 (26, 77) 52 (25, 73) 0.044
Complaint (in last 2 yrs) 21 (0.9%) 7 (1.1%) 12 (1.3%) 2 (0.3%) 0.11
Wine3 173 (23, 505) 458 (265, 722) 19 (7, 56) 356 (134, 638) <0.001
Fruits 8 (1, 33) 36 (19, 80) 3 (1, 7) 9 (1, 32) <0.001
Meat Products 67 (16, 232) 319 (153, 534) 15 (8, 34) 92 (33, 184) <0.001
Fish Products 12 (3, 50) 63 (28, 125) 4 (2, 12) 12 (2, 40) <0.001
Sweet Products 8 (1, 33) 38 (17, 85) 3 (1, 8) 8 (0, 34) <0.001
Gold Products 24 (9, 56) 56 (31, 111) 11 (5, 24) 26 (10, 58) <0.001
Number of Purchases made with a discount 2 (1, 3) 1 (1, 2) 2 (1, 3) 2 (1, 4) <0.001
Customer Accepted the Offer 608 (27%) 271 (41%) 153 (17%) 184 (28%) <0.001
Through the Company's Website4 4 (2, 6) 5 (3, 7) 2 (1, 3) 5 (3, 7) <0.001
Using a Catalogue 2 (0, 4) 5 (3, 7) 0 (0, 1) 2 (1, 4) <0.001
Directly in Stores 5 (3, 8) 8 (6, 10) 3 (3, 4) 6 (4, 9) <0.001
Website Visits (in last month) 6 (3, 7) 3 (2, 5) 7 (6, 8) 5 (4, 7) <0.001
1 Median (IQR); n (%)
2 One-way ANOVA; Pearson's Chi-squared test
3 Amount spent on a type of product in last two years
4 Number of purchases made through a way
# --- Heatmap ---
# Products: mean amount spent per product within each k-medoids cluster
htp_1 <- customer %>% select(10:15) %>% 
  mutate(cluster = km_mod$clustering) %>% 
  # pivot everything except the cluster label (clearer than positional 1:6)
  pivot_longer(-cluster, names_to = "product", values_to = "value") %>% 
  group_by(cluster, product) %>% 
  summarise(amount = mean(value)) %>% 
  ggplot(aes(x = cluster, y = product, fill = amount)) +
  geom_tile() + theme_bw() +
  theme(legend.position = "bottom") +
  # labels follow the alphabetical order of the Mnt* column names
  scale_y_discrete(labels = c("Fish", "Fruit", "Gold", "Meat", "Sweet", "Wine")) +
  scale_fill_distiller(palette = "OrRd") +
  labs(x = "Cluster", y = "Product")

# Places: mean number of purchases per channel within each cluster
htp_2 <- customer %>% select(18:20) %>% 
  mutate(cluster = km_mod$clustering) %>% 
  pivot_longer(-cluster, names_to = "place", values_to = "value") %>% 
  group_by(cluster, place) %>% 
  summarise(number = mean(value)) %>% 
  ggplot(aes(x = cluster, y = place, fill = number)) +
  geom_tile() + theme_bw() +
  theme(legend.position = "bottom") +
  scale_y_discrete(labels = c("Catalog", "Store", "Website")) +
  scale_fill_distiller(palette = "OrRd") +
  labs(x = "Cluster", y = "Place")

# Combine side by side (patchwork); fixed "Amount Spend" typo in the title
htp_1 + htp_2 +
  plot_annotation(title = "Heatmap of Amount Spent and Number of Visits by K-medoids Cluster")

# --- Networks for continuous vars --- 
# One regularized partial-correlation network (EBICglasso) per k-medoids
# cluster, drawn with qgraph.
lapply(seq_len(length(unique(km_mod$clustering))), function(x) {
  # BUG FIX: the printed subject count previously used km$cluster (the
  # k-means labels) while the subset below uses km_mod$clustering
  # (k-medoids) — the counts now describe the rows actually analysed.
  print(paste("Cluster", x, "with", sum(km_mod$clustering == x), "subjects"))
  
  cl <- customer_con[km_mod$clustering == x, ]
  net <- estimateNetwork(as.matrix(cl), default = "EBICglasso")
  qgraph(net$graph, labels = names(cl), layout = "spring",
         theme = "TeamFortress", label.cex = 1.5,
         label.fill.horizontal = .7)
})
## [1] "Cluster 1 with 1045 subjects"

## [1] "Cluster 2 with 584 subjects"

## [1] "Cluster 3 with 609 subjects"

## [[1]]
## From     To  Weight
## 2     ---     3   -0.03 
## 1     ---     5   0.28 
## 1     ---     6   0.05 
## 3     ---     6   -0.04 
## 1     ---     7   0.29 
## 3     ---     7   -0.23 
## 5     ---     7   0.05 
## 6     ---     7   0.05 
## 1     ---     8   0.04 
## 3     ---     8   -0.06 
## 6     ---     8   0.19 
## 7     ---     8   0.06 
## 1     ---     9   0.12 
## 4     ---     9   0.05 
## 6     ---     9   0.18 
## 7     ---     9   0.02 
## 8     ---     9   0.18 
## 4     ---     10      0.01 
## 6     ---     10      0.09 
## 8     ---     10      0.09 
## 1     ---     11      -0.02 
## 2     ---     11      0.4 
## 3     ---     11      0.38 
## 1     ---     12      0.12 
## 3     ---     12      0.01 
## 4     ---     12      0 
## 5     ---     12      0.14 
## 8     ---     12      0.01 
## 9     ---     12      0.07 
## 10    ---     12      0.03 
## 11    ---     12      0.02 
## 1     ---     13      0.29 
## 3     ---     13      -0.02 
## 5     ---     13      0.07 
## 7     ---     13      0.13 
## 8     ---     13      0.04 
## 9     ---     13      0.04 
## 10    ---     13      0.07 
## 1     ---     14      0.12 
## 5     ---     14      0.13 
## 10    ---     14      0.02 
## 12    ---     14      0.08 
## 1     ---     15      -0.38 
## 2     ---     15      0.04 
## 3     ---     15      0.06 
## 4     ---     15      -0.02 
## 5     ---     15      0.06 
## 6     ---     15      -0.03 
## 7     ---     15      -0.03 
## 8     ---     15      -0.04 
## 9     ---     15      -0.01 
## 11    ---     15      0.22 
## 12    ---     15      0.44 
## 13    ---     15      -0.05 
## 14    ---     15      0 
## 
## [[2]]
## From     To  Weight
## 1     ---     2   0.03 
## 1     ---     3   0.1 
## 2     ---     3   0 
## 3     ---     4   0.03 
## 1     ---     5   0.15 
## 3     ---     5   0.06 
## 1     ---     6   0.01 
## 3     ---     6   -0.07 
## 4     ---     6   0.01 
## 5     ---     6   0.03 
## 3     ---     7   -0.05 
## 5     ---     7   0.07 
## 6     ---     7   0.07 
## 2     ---     8   -0.04 
## 3     ---     8   -0.07 
## 5     ---     8   0.06 
## 6     ---     8   0.29 
## 7     ---     8   0.04 
## 1     ---     9   0.01 
## 3     ---     9   -0.04 
## 5     ---     9   -0.12 
## 6     ---     9   0.19 
## 7     ---     9   0.07 
## 8     ---     9   0.19 
## 2     ---     10      -0.07 
## 3     ---     10      -0.02 
## 5     ---     10      0.09 
## 8     ---     10      0.06 
## 9     ---     10      0.1 
## 2     ---     11      0.14 
## 3     ---     11      0.25 
## 5     ---     11      -0.05 
## 6     ---     11      -0.04 
## 7     ---     11      0.09 
## 8     ---     11      -0.02 
## 9     ---     11      -0.12 
## 1     ---     12      0.03 
## 5     ---     12      0.09 
## 9     ---     12      0.3 
## 10    ---     12      0.48 
## 11    ---     12      0.22 
## 2     ---     13      -0.01 
## 5     ---     13      0.13 
## 6     ---     13      0.02 
## 7     ---     13      0.71 
## 8     ---     13      0.03 
## 10    ---     13      0.05 
## 11    ---     13      0.16 
## 12    ---     13      -0.03 
## 1     ---     14      0.04 
## 5     ---     14      0.47 
## 6     ---     14      0.2 
## 7     ---     14      0.08 
## 8     ---     14      0.12 
## 9     ---     14      0.1 
## 10    ---     14      -0.11 
## 11    ---     14      0.19 
## 12    ---     14      0.12 
## 13    ---     14      -0.11 
## 1     ---     15      -0.15 
## 3     ---     15      -0.13 
## 5     ---     15      0.1 
## 6     ---     15      0 
## 9     ---     15      -0.03 
## 11    ---     15      0.27 
## 12    ---     15      0.03 
## 13    ---     15      -0.12 
## 14    ---     15      -0.19 
## 
## [[3]]
## From     To  Weight
## 2     ---     3   -0.02 
## 1     ---     5   0.22 
## 2     ---     5   -0.07 
## 3     ---     5   -0.09 
## 3     ---     6   -0.03 
## 1     ---     7   0.27 
## 3     ---     7   -0.2 
## 4     ---     7   0.04 
## 5     ---     7   0.01 
## 6     ---     7   0.14 
## 3     ---     8   -0.07 
## 5     ---     8   -0.01 
## 6     ---     8   0.28 
## 7     ---     8   0.06 
## 1     ---     9   0.05 
## 2     ---     9   -0.01 
## 3     ---     9   -0.03 
## 4     ---     9   -0.01 
## 5     ---     9   -0.03 
## 6     ---     9   0.27 
## 7     ---     9   0.03 
## 8     ---     9   0.27 
## 2     ---     10      -0.04 
## 3     ---     10      0.08 
## 4     ---     10      0.01 
## 5     ---     10      0.08 
## 6     ---     10      0.05 
## 8     ---     10      0.11 
## 9     ---     10      0.03 
## 2     ---     11      0.28 
## 3     ---     11      0.19 
## 6     ---     11      -0.05 
## 7     ---     11      0.04 
## 8     ---     11      -0.04 
## 10    ---     11      0.08 
## 1     ---     12      0.11 
## 2     ---     12      -0.08 
## 3     ---     12      0.04 
## 4     ---     12      0.02 
## 5     ---     12      0.23 
## 9     ---     12      0.07 
## 10    ---     12      0.16 
## 11    ---     12      0.04 
## 1     ---     13      0.13 
## 2     ---     13      -0.12 
## 4     ---     13      0.02 
## 5     ---     13      0.23 
## 7     ---     13      0.38 
## 8     ---     13      0.08 
## 10    ---     13      0.05 
## 11    ---     13      0.16 
## 2     ---     14      -0.15 
## 5     ---     14      0.24 
## 6     ---     14      0.13 
## 8     ---     14      0.08 
## 9     ---     14      0.08 
## 11    ---     14      0.11 
## 12    ---     14      0.12 
## 1     ---     15      -0.37 
## 2     ---     15      0.07 
## 3     ---     15      0.05 
## 5     ---     15      0.17 
## 7     ---     15      -0.04 
## 8     ---     15      -0.03 
## 9     ---     15      -0.08 
## 11    ---     15      0.26 
## 12    ---     15      0.21 
## 13    ---     15      -0.06 
## 14    ---     15      -0.08

Clustering on big five

Latent class analysis (LCA)

# Try #class from 2 to 8
# poLCA expects integer item responses; across() replaces the superseded
# mutate_all(). Columns 2:51 are the 50 items (column 1 is the row id).
lca_dat <- big_five[, 2:51] %>% mutate(across(everything(), as.numeric))
# One-line formula over all 50 items with no covariates
f <- formula(paste("cbind(", paste(colnames(lca_dat), collapse = ","), ") ~ 1"))

# Fit LCA with 2-8 classes in parallel; 20 random restarts per fit, same
# seed per worker so each fit is reproducible.
lca_res <- mclapply(2:8, function(x) {
  set.seed(202212)
  poLCA(f, lca_dat, nclass = x, verbose = FALSE, nrep = 20)
}, mc.cores = num.cores)

# Collect BIC per class count; vapply guarantees a plain numeric vector
df_bic <- data.frame(Class = 2:8,
                     BIC = vapply(lca_res, function(obj) obj$bic, numeric(1)))
df_bic %>% 
  ggplot(aes(x = Class, y = BIC, label = round(BIC))) +
  geom_line() + geom_point() +
  geom_text(vjust = -.35) + theme_bw() +
  labs(title = "BIC for Latent Class Analysis for Big Five Data")

# Choose 5 classes with smallest BIC
n_class <- df_bic$Class[which.min(df_bic$BIC)]
fit_lca <- lca_res[[n_class - 1]]  # lca_res is indexed from nclass = 2

Answers for each class

# --- Alluvial plots ---
# Likert score labels, best (5) down to worst (1)
score <- c("Excellent", "Very Good", "Good", "Fair", "Poor")
# Human-readable names for the 50 items, in EXT1..OPN10 order
# (fixed "Distubed" -> "Disturbed")
col_names <- c("PartyLife", "TalkLess", "SocialComfort", "Invisible", "TopicStarter", 
               "SayLittle", "ExtensiveContact", "AvoidAttention", "CentralPerson", "Quiet",
               "Stressed", "Relaxed", "Worried", "FeelBlueLess", "Disturbed",
               "GetUpset", "MoodChange", "FreqMoodSwing", "Irritated", "FeelBlue",
               "ConcernLess", "Interested", "Insult", "Sympathize", "NotInterestProb",
               "SoftHeart", "NotInterestPpl", "TakeTime", "FeelEmo", "MakeEase",
               "Prepared", "LeaveBelonging", "Meticulous", "MakeMess", "NoDelay",
               "Forgetful", "LikeOrder", "ShirkDuty", "FollowSche", "Exacting",
               "RichVocal", "DiffAbstract", "ViviImag", "NoInAbstract", "ExcelIdea",
               "NoImag", "QuickUnderstand", "DiffWord", "SlowReflect", "FullIdea")
# Factor prefix -> display name lookup (fixed "EmoStablility" -> "EmoStability")
five_factor <- data.frame(
  fact_name = c("Extraversion", "EmoStability", "Agreeableness", "Conscientious", "Intellect"),
  factr = c("EXT", "EST", "AGR", "CSN", "OPN"))
# Item keying direction: 1 = positively keyed, 0 = reverse keyed
# (paste0 replaces paste(..., sep = ""))
dirct <- data.frame(variable = c(paste0("EXT", seq_len(10)),
                                 paste0("EST", seq_len(10)),
                                 paste0("AGR", seq_len(10)),
                                 paste0("CSN", seq_len(10)),
                                 paste0("OPN", seq_len(10))),
                    dirct = c(rep(c(1, 0), 5),
                              0, 1, 0, 1, rep(0, 6),
                              0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
                              1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
                              1, 0, 1, 0, 1, 0, 1, 1, 1, 1))
# combine lca clusters
# Attach each respondent's predicted latent class to their answers
dat_clust <- lca_dat %>% 
  mutate(clust = fit_lca$predclass)

# Count answers (1-5) per item x cluster, then spread the answer levels into
# columns. id_cols is named explicitly: the previous positional c(1:2) relied
# on pivot_wider's old signature and errors on tidyr >= 1.2.
dat_forshow <- dat_clust %>% 
  pivot_longer(1:50, names_to = "variable", values_to = "value") %>% 
  group_by(variable, clust, value) %>% 
  summarise(tot = n()) %>% 
  pivot_wider(id_cols = c(variable, clust), names_from = "value", values_from = "tot")
# Answer levels absent in a cluster become NA after widening; they are zeros
dat_forshow[is.na(dat_forshow)] <- 0

# change frequency to percentage
# convert answer to score: subtract 6 by answers of negative variables
# Per-item answer totals across all clusters; "comb" = variable+answer key
# used for the join below
tot_dat <- dat_forshow %>% 
  pivot_longer(3:7, names_to = "answer", values_to = "number") %>% 
  group_by(variable, answer) %>% summarise(tot = sum(number)) %>% 
  mutate(comb = paste(variable, answer, sep = ""))
# Long per-cluster counts with the same join key
dat_forshow_1 <- dat_forshow %>% 
  pivot_longer(3:7, names_to = "answer", values_to = "number") %>% 
  mutate(comb = paste(variable, answer, sep = ""))
# Attach keying direction, then flip reverse-keyed items (score = 6 - answer)
dat_forshow_1 <- left_join(dat_forshow_1, dirct, by = "variable")
dat_forshow_2 <- dat_forshow_1 %>% 
  mutate(
    answer = as.numeric(answer),
    score = ifelse(dirct == 0, 6 - answer, answer)) %>% 
  select(1, 2, 7, 4, 5)   # positional: presumably variable, clust, score, number, comb — TODO confirm

# change label name
# Join the per-item totals, compute the within-item percentage of each score,
# and widen by score (positional select(8, 2:3, 9) is fragile — TODO name
# these columns explicitly once confirmed)
dat_forshow_3 <- left_join(dat_forshow_2, 
                           tot_dat %>% select(3, 4), by = "comb") %>% 
  mutate(variable = variable.x, prct = number/tot * 100) %>% 
  select(8, 2:3, 9) %>% pivot_wider(names_from = "score", values_from = "prct")

# change factor name
# Derive the 3-letter factor prefix from each item name, then join the
# human-readable item names and factor display names.
dat_forshow_4 <- dat_forshow_3 %>% 
  mutate(factr = substr(variable, 1, 3))
# paste0 replaces paste(..., sep = "")
dat_forshow_5 <- 
  left_join(dat_forshow_4, 
            data.frame(name = col_names,
                       variable = c(paste0("EXT", seq_len(10)),
                                    paste0("EST", seq_len(10)),
                                    paste0("AGR", seq_len(10)),
                                    paste0("CSN", seq_len(10)),
                                    paste0("OPN", seq_len(10)))),
            by = "variable")

dat_forshow_fin <- left_join(dat_forshow_5, five_factor, by = "factr")
# Build an alluvial plot (Question -> Score -> Cluster) for the big-five
# factor whose display name matches x; percentages come from the global
# dat_forshow_fin built above.
make_alluvial <- function(x){
  plot_dat <- dat_forshow_fin[, -c(1, 8)] %>% 
    pivot_longer(c(2:6), names_to = "Score", values_to = "Prct") %>% 
    mutate(clust = as.factor(clust),
           Score = factor(Score, levels = c(5:1), labels = score)) %>% 
    filter(str_detect(fact_name, x))
  
  plot_dat %>% 
    ggplot(aes(y = Prct, axis1 = name, axis2 = Score, axis3 = clust)) +
    geom_alluvium(aes(fill = clust), width = 1/12) +
    geom_stratum(width = 1/12, fill = "black", color = "grey") +
    geom_label(stat = "stratum", aes(label = after_stat(stratum))) +
    scale_x_discrete(limits = c("Question", "Score", "Cluster"), expand = c(.09, .09)) +
    scale_y_continuous(breaks = NULL) +
    scale_fill_brewer(type = "qual", palette = "Dark2") +
    guides(fill = guide_legend(title = "Cluster")) +
    theme_bw() +
    labs(y = "Percentage",
         title = paste("Alluvial Plot of", x, "by Cluster"))
}

# plot alluvial
# One alluvial plot per five-factor dimension
lapply(unique(dat_forshow_fin$fact_name), make_alluvial)

Factor-led subgroups

# --- cluster 4: excellent on all five ---
# Respondents assigned to latent class 4
clust_4 <- big_five %>% 
  mutate(clust = fit_lca$predclass) %>% 
  filter(clust == 4)

# by continent
# Percentage of cluster-4 respondents living in each continent
clust_4 %>% 
  count(continent, name = "tot") %>% 
  mutate(prct = tot / sum(fit_lca$predclass == 4) * 100) %>% 
  ggplot(aes(x = prct, y = continent)) +
  geom_col()

# --- Cluster 5: lowest-scoring group ---
# Respondents assigned to latent class 5
clust_5 <- big_five %>% 
  mutate(clust = fit_lca$predclass) %>% 
  filter(clust == 5)

# Percentage of cluster-5 respondents living in each continent
clust_5 %>% 
  count(continent, name = "tot") %>% 
  mutate(prct = tot / sum(fit_lca$predclass == 5) * 100) %>% 
  ggplot(aes(x = prct, y = continent)) +
  geom_col()